Exploratory Data Analysis and Network Analysis¶
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from collections import Counter
from joblib import load, dump
from ipywidgets import widgets
import plotly.graph_objects as go
import plotly.express as px
from dash import html, dcc
from dash.dependencies import Input, Output
import matplotlib.gridspec as gridspec
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.stats import entropy
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from wordcloud import WordCloud
import community as community_louvain
from adjustText import adjust_text
import seaborn as sns
import pandas as pd
import numpy as np
import random
import networkx as nx
import pyLDAvis
import dash
import string
import time
import os
import re
The dataframes imported from the previous notebook consist of a document topic distribution — each document being one 5000-word segment of a book — and features about the texts covering the following attributes: 'title', 'author', 'date', 'gender', 'birthdate', 'nationality' and 'source' (always given), as well as 'period', 'mode', 'genre', 'role' and 'polarity' (only filled for about 1/4 of the texts).
Additionally, a number of features relevant for the topic explorations offered by pyLDAvis are imported as well.
# Per-segment text features (one row per 5000-word document) for each topic model.
df_txt_features_LDA = pd.read_csv('./analysis/df_txt_features_LDA.csv')
df_txt_features_CTM = pd.read_csv('./analysis/df_txt_features_CTM.csv')
df_txt_features_ETM = pd.read_csv('./analysis/df_txt_features_ETM.csv')
# BUG FIX: these three previously assigned the bare path string (the parentheses
# were a no-op) instead of actually loading the joblib file like the lines below.
top_words_per_topic_LDA = load('./analysis/top_words_per_topic_LDA.joblib')
top_words_per_topic_CTM = load('./analysis/top_words_per_topic_CTM.joblib')
top_words_per_topic_ETM = load('./analysis/top_words_per_topic_ETM.joblib')
# Topic-term and document-topic distributions per model, as saved upstream.
topic_term_dists_LDA = load('./analysis/topic_term_dists_LDA.joblib')
doc_topic_dists_LDA = load('./analysis/doc_topic_dists_LDA.joblib')
topic_term_dists_CTM = load('./analysis/topic_term_dists_CTM.joblib')
doc_topic_dists_CTM = load('./analysis/doc_topic_dists_CTM.joblib')
topic_term_dists_ETM = load('./analysis/topic_term_dists_ETM.joblib')
doc_topic_dists_ETM = load('./analysis/doc_topic_dists_ETM.joblib')
# Shared corpus statistics used by pyLDAvis and the relevance/saliency code below.
vocab = load('./analysis/vocab.joblib')
doc_lengths = load('./analysis/doc_lengths.joblib')
term_frequency = load('./analysis/term_frequency.joblib')
LDA¶
pyLDAvis offers an intuitive method for exploring the most important words for each topic, the weight they carry within it, and the relationship and distance between the given topics. For this, multidimensional scaling reduces the topic-term distribution to a two-dimensional space, retaining both the importance of a given topic within the corpus and the topics' distances to one another, using Jensen-Shannon divergence as its metric — a common approach for multidimensional scaling.
# Build the pyLDAvis MDS visualization for the LDA model and render it inline.
# Argument order follows pyLDAvis.prepare(topic_term_dists, doc_topic_dists,
# doc_lengths, vocab, term_frequency).
prepared_data = pyLDAvis.prepare(topic_term_dists_LDA, doc_topic_dists_LDA, doc_lengths, vocab, term_frequency)
pyLDAvis.display(prepared_data)
Topic Interpretation: Analyzing the intersection of the most salient and relevant terms for each topic, aiming to synthesize the underlying themes into coherent labels. Ennui, ants, firmness, confessor, vegetables, illusion, calculation, morbid, blasted, coolies, bayonets and terms of logical reasoning are strewn throughout many topics.
# Hand-written interpretive labels for the 73 LDA topics.
# FIX: corrected spelling errors in the displayed label strings
# (e.g. "Spacial"->"Spatial", "Vapires, Ragality"->"Vampires, Regality",
# "Exlporation"->"Exploration", "Prohecies"->"Prophecies").
topic_labels = {
    "Topic 1": "Ominous Atmosphere - \n Spatial and Auditory Imagery: \n vastness, archaic, Refinement, Gloom, demons.",
    "Topic 2": "Emotional Dialogue - \n Fear, Secrecy, Flattery, Arousal and Strife \n - Religion and Devils.",
    "Topic 3": "Status and Individuality - \n Striving, Misery and Plentifulness - Excess.",
    "Topic 4": "Myths, Trials and Death - \n Persecution of Crime, Telling Tales, magic and ants.",
    "Topic 5": "Excitability, Madness and Deceit - \n Aggression, conflict and glee.",
    "Topic 6": "Nature and Reasoning - \n Creativity and Understanding, mixed with Nature.",
    "Topic 7": "Social Pleasantries - \n Diplomacy, Plotting to Gossip.",
    "Topic 8": "Faith, Convictions, Chivalry and Death - \n Erudition, Religion and Knights. Ants.",
    "Topic 9": "Fortitude, Conviction and Adventure - \n Danger and social Station.",
    "Topic 10": "Ferocity and Tragedy - \n animalistic traits, intimacy, conflict and science.",
    "Topic 11": "Ravens and Gloom - Longing, Death and Artifice.",
    "Topic 12": "Home Invasion - Domestic Mystery and Conflict.",
    "Topic 13": "Rituals and Festivities - \n Dance, Witchcraft and Coronations.",
    "Topic 14": "Conflict, Animosity and Change - \n Emotional Changes, Death and Construction.",
    "Topic 15": "Trickery and Science - \n Deceit, Reasoning and Institutions.",
    "Topic 16": "Desecrated Chapel - \n Confessions and Defilement - Devils and Maniacs.",
    "Topic 17": "(Un-)death, spectral bodies and judgement - \n human physicality, grief, emotions.",
    "Topic 18": "Mystery and Adversity - \n Dream and fugue states, Investigation.",
    "Topic 19": "Forlorn Carnival - Dances, Disgust and Intimacy.",
    "Topic 20": "Science, Reasoning and Objects - \n Technology, Professions and Nature.",
    "Topic 21": "War, Punishment and Exploration.",
    "Topic 22": "Emotional Dynamics and Interactions.",
    "Topic 23": "War, dreams and demons.",
    "Topic 24": "Human Interactions and Emotional States.",
    "Topic 25": "Flattery, clothing, Interactions.",
    "Topic 26": "Witchcraft, Rituals and Fear of it - \n Banishment, Threats and Armor.",
    "Topic 27": "Dragon Attack and Defense - \n Troops, Mountains and Cynicism.",
    "Topic 28": "Communion in Nature - \n Transformation, Relationships and Identity.",
    "Topic 29": "Bickering, Fighting and Mountains.",
    "Topic 30": "Bureaucracy, Bargaining and Dissatisfaction.",
    "Topic 31": "Exploration, Gloom, Caverns.",
    "Topic 32": "Tranquility and Bustle - \n Terms of Relaxation, Calm and Action.",
    "Topic 33": "Treacherous Company - on the run and scarred.",
    "Topic 34": "Secrets and Suspense - \n Mystery, Devils and Assassinations.",
    "Topic 35": "Mental Illness, Law and Outcasts - \n Fear, Suspicion and Struggles.",
    "Topic 36": "Individualism vs. Conformity - \n Rebellion and Social Norms.",
    "Topic 37": "Order and Chaos - \n Constrained Focus and Unchecked Emotions.",
    "Topic 38": "Psychology, Trauma and Secrets.",
    "Topic 39": "Quest for Meaning - Self-Discovery, Transformation.",
    "Topic 40": "Ambition and Struggle - Emotional Turmoil.",
    "Topic 41": "Despair, Isolation and Oppression.",
    "Topic 42": "Illusion, Enchantment and Betrayal.",
    "Topic 43": "Woodlands, Mystery, Illusion, Beasts.",
    "Topic 44": "Companionship in Times of Trial and Distress.",
    "Topic 45": "Intimacy, Emotions and Identity.",
    "Topic 46": "Frustration, Society, Retreat into Nature - \n Society, Reason, Tension, negative Feelings, Forests.",
    "Topic 47": "Human Nature and the Connection to the Land, \n Myth and (Human) Nature - Solace, Inspiration, Acceptance for Hardships.",
    "Topic 48": "Enthralling Garden full of Voices - \n Enchantment and Vocalization, Nature.",
    "Topic 49": "Departure and Music.",
    "Topic 50": "Myth, Nature, Wonder and Despair.",
    "Topic 51": "Disillusionment with Society - \n Resistance, Protest, Retreat.",
    "Topic 52": "Adventure, Splendor, Power and Challenges, History.",
    "Topic 53": "Mercantile and Creativity - Haggling and Emotions.",
    "Topic 54": "Medieval Cities, Castles and Courtship.",
    "Topic 55": "Crocodiles, Massacres and Traveling.",
    "Topic 56": "Exploration of an Island and Obsession.",
    "Topic 57": "Carnage near a Castle.",
    "Topic 58": "Weddings and Rituals - Clamoring Throng.",
    "Topic 59": "Judgment and Scrutiny - Tense Diplomacy.",
    "Topic 60": "Confession and marriage before \n Conscription and Battle.",
    "Topic 61": "Vampires, Regality, Experiments, \n Festivities and Sacrifice.",
    "Topic 62": "Dragons, Subterraneous Lairs, Riddles and Lore.",
    "Topic 63": "Hidden Dangers, Fear, Anticipation, Supernatural.",
    "Topic 64": "Artistic Ambition and Trials - Mastery and the Devil.",
    "Topic 65": "Atmospheric Battle Descriptions and Royalty.",
    "Topic 66": "Hidden Knowledge, Learning and Secrets.",
    "Topic 67": "Monsters, Art, Romance - Myth and Gloom.",
    "Topic 68": "Secluded Initiation Rites.",
    "Topic 69": "Seduction, Deception, Violence, Bureaucracy.",
    "Topic 70": "Myth and splendor - Wealth and Castles.",
    "Topic 71": "Haunted Castles and their Prophecies.",
    "Topic 72": "Festivities, Noise, Crowds.",
    "Topic 73": "Camps, Trenches and Weather."
}
Visualizing the qualities of topics¶
We recreate the term relevance measure used in pyLDAvis and create some word clouds for ease of comparison.
def calculate_term_relevance(topic_term_dists, term_frequency, lambda_step=0.6):
    """
    Compute the pyLDAvis-style relevance score of every term for every topic.

    relevance(w, t) = lambda * log p(w | t)
                      + (1 - lambda) * log( p(w | t) / p(w) )

    where p(w) is the term's probability in the whole corpus. Lower lambda
    emphasizes lift (topic-exclusive terms), higher lambda raw probability.
    Returns an array with the same shape as ``topic_term_dists``.
    """
    # Corpus-wide term probabilities from raw counts.
    corpus_term_prob = term_frequency / term_frequency.sum()
    eps = 1e-12  # keeps log() away from zero
    weighted_logprob = lambda_step * np.log(topic_term_dists + eps)
    weighted_lift = (1 - lambda_step) * np.log(topic_term_dists / corpus_term_prob + eps)
    return weighted_logprob + weighted_lift
def calculate_saliency(topic_term_dists, term_frequency):
    """
    Calculate the saliency of terms according to the logic of pyLDAvis.

    saliency(w) = p(w) * sum_t p(t | w) * log( p(t | w) / p(t) )

    i.e. term frequency times the term's "distinctiveness" (KL contribution of
    its topic posterior vs. the overall topic distribution).
    Returns a 1-D array of length n_terms.

    BUG FIX: p(t | w) was previously obtained by normalizing over terms
    (axis=1), which just re-yields p(w | t); and p(t) was summed over topics
    (axis=0), giving a per-term vector instead of a per-topic one. Both axes
    are now oriented as in the formula above.
    """
    # p(w): term probability from raw corpus counts.
    term_prob = term_frequency / term_frequency.sum()
    # p(t | w): normalize each term's column across topics.
    p_t_given_w = topic_term_dists / topic_term_dists.sum(axis=0)[None, :]
    # p(t): overall topic proportion implied by the topic-term matrix
    # (uniform when each row of topic_term_dists sums to 1).
    p_t = topic_term_dists.sum(axis=1) / topic_term_dists.sum()
    # Distinctiveness per term: sum over topics of p(t|w) log(p(t|w)/p(t)).
    distinctiveness = np.sum(p_t_given_w * np.log(p_t_given_w / p_t[:, None]), axis=0)
    saliency = term_prob * distinctiveness
    return saliency
def generate_word_clouds(term_relevance, saliency, topic_term_dists_LDA, vocab, n_topics):
    """
    Draw a grid of word clouds: for every topic, one "salient" cloud (green)
    and one "relevant" cloud (red), each built from that topic's 30 top terms.
    """
    cloud_px = 200  # raster size of each word cloud in pixels (square)
    # 19 x 8 grid = 152 axes, enough for two clouds per topic at 73 topics.
    fig, axes = plt.subplots(nrows=19, ncols=8, figsize=(36, 85))
    axes = axes.flatten()

    def _draw(ax, word_freq, cmap, title):
        # Render one frequency dict as a cloud onto one axis.
        cloud = WordCloud(width=cloud_px, height=cloud_px, background_color='white',
                          colormap=cmap).generate_from_frequencies(word_freq)
        ax.imshow(cloud, interpolation='bilinear')
        ax.axis('off')
        ax.set_title(title, fontsize=23)

    for topic_idx in range(n_topics):
        # Salient cloud: corpus saliency reweighted by this topic's term weights.
        topic_saliency = saliency * topic_term_dists_LDA[topic_idx, :]
        top_salient = topic_saliency.argsort()[-30:][::-1]
        _draw(axes[2 * topic_idx],
              {vocab[t]: topic_saliency[t] for t in top_salient},
              'Greens', f'Topic {topic_idx+1} - Salient')
        # Relevant cloud: pyLDAvis-style relevance scores for this topic.
        topic_relevance = term_relevance[topic_idx, :]
        top_relevant = topic_relevance.argsort()[-30:][::-1]
        _draw(axes[2 * topic_idx + 1],
              {vocab[t]: topic_relevance[t] for t in top_relevant},
              'Reds', f'Topic {topic_idx+1} - Relevant')

    # Blank out whatever grid cells the topics did not fill.
    for spare_ax in axes[n_topics * 2:]:
        spare_ax.set_visible(False)

    plt.subplots_adjust(wspace=0.5, hspace=0.5)
    plt.tight_layout()
    plt.show()
# Compute relevance and saliency for the LDA model, then draw the cloud grid.
# term_frequency is wrapped in np.array so the normalizing divisions broadcast.
term_relevance = calculate_term_relevance(topic_term_dists_LDA, np.array(term_frequency))
saliency = calculate_saliency(topic_term_dists_LDA, np.array(term_frequency))
# shape[0] of the topic-term matrix is the number of topics (two clouds each).
generate_word_clouds(term_relevance, saliency, topic_term_dists_LDA, vocab, topic_term_dists_LDA.shape[0])